Setup

Parameters

print(params)
## $counts.path
## [1] "../../data/counts"
## 
## $sample.metadata.path
## [1] "../../data/sample_metadata"
## 
## $gene.metadata.path
## [1] "../../data/gene_metadata"
## 
## $output.rds.path
## [1] "../../results/01_Filter_Samples"
## 
## $umi.counts
## [1] TRUE
## 
## $species
## [1] "DRE"
## 
## $species.name
## [1] "Danio rerio"
## 
## $min.genes
## [1] 10000
## 
## $min.reads
## [1] 5e+05
## 
## $min.replicates
## [1] 4
## 
## $min.cpm
## [1] 1
## 
## $normalize.by.condition
## [1] TRUE
## 
## $set.scale
## [1] TRUE

Libraries

library(dplyr)

Functions

source("../functions/helper_functions.R")
source("../functions/filter_samples.R")
source("../functions/filter_samples_plots.R")

Session Info

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 11.7.8
## 
## Matrix products: default
## LAPACK: /Users/cbucao/miniforge3/envs/fish-ev-gene-function/lib/libopenblasp-r0.3.25.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
## [1] viridis_0.6.3        viridisLite_0.4.1    gplots_3.1.3         ggplot2_3.3.6        matrixStats_0.62.0  
## [6] edgeR_3.34.1         limma_3.48.3         colorBlindness_0.1.9 dplyr_1.0.9         
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.10         locfit_1.5-9.7      ape_5.8             lattice_0.21-8      tidyr_1.2.1        
##  [6] gtools_3.9.4        digest_0.6.37       utf8_1.2.3          R6_2.5.1            evaluate_1.0.1     
## [11] highr_0.10          pillar_1.7.0        yulab.utils_0.1.8   rlang_1.1.4         lazyeval_0.2.2     
## [16] jquerylib_0.1.4     rmarkdown_2.22      slider_0.2.2        labeling_0.4.2      munsell_0.5.0      
## [21] compiler_4.1.0      xfun_0.39           pkgconfig_2.0.3     gridGraphics_0.5-1  htmltools_0.5.5    
## [26] tidyselect_1.2.0    tibble_3.1.7        gridExtra_2.3       fansi_1.0.4         withr_3.0.2        
## [31] crayon_1.5.3        bitops_1.0-7        grid_4.1.0          nlme_3.1-162        jsonlite_1.8.9     
## [36] gtable_0.3.3        lifecycle_1.0.4     DBI_1.1.3           magrittr_2.0.3      scales_1.2.1       
## [41] warp_0.2.0          KernSmooth_2.23-21  tidytree_0.4.6      cli_3.6.3           cachem_1.0.8       
## [46] farver_2.1.1        renv_0.17.3         fs_1.6.5            bslib_0.5.0         ellipsis_0.3.2     
## [51] generics_0.1.3      vctrs_0.4.1         cowplot_1.1.1       tools_4.1.0         treeio_1.16.2      
## [56] glue_1.8.0          purrr_0.3.5         parallel_4.1.0      fastmap_1.1.1       yaml_2.3.7         
## [61] colorspace_2.1-0    BiocManager_1.30.21 caTools_1.18.2      knitr_1.43          sass_0.4.6

Data

Metadata

# Sample metadata
metadata <- 
  read.delim(file.path(params$sample.metadata.path, 
                       paste(params$species, "sample_annotation.tsv", sep="_")), 
             sep="\t") %>%
  dplyr::arrange(Tissue, IndID, .by_group=TRUE)

# Gene metadata
biotype <- 
  read.delim(file.path(params$gene.metadata.path, 
                       paste(params$species, "biotype_Ensembl_105.txt", sep="_")), 
             sep="\t")

UMI counts

# Load counts
counts <- 
  vector("list", length(levels(as.factor(metadata$ProjectSeqBatch))))
names(counts) <- levels(as.factor(metadata$ProjectSeqBatch))

if (params$umi.counts==TRUE) {
  counts <- lapply(setNames(names(counts),names(counts)), function(batch) {
    read.delim(file.path(params$counts.path, 
                         paste(params$species, batch, "UMI_counts.txt", sep="_")), 
               sep="\t")
  })  
} else {
  counts <- lapply(setNames(names(counts),names(counts)), function(batch) {
    read.delim(file.path(params$counts.path, 
                         paste(params$species, batch, "counts.txt", sep="_")), 
               sep="\t")
  })
}

# Map barcodes to sample names
counts <- 
  lapply(setNames(names(counts), names(counts)), function(batch) {
    map_barcodes_to_sample_names(counts[[batch]], 
                                 metadata[metadata$ProjectSeqBatch==batch,])
  }) %>%
  Reduce(function(...) merge(..., by="GeneID"), .) %>%
  tibble::column_to_rownames("GeneID") %>%
  as.matrix()

counts <- counts[,metadata$SampleName]

data.init <- list("metadata"=metadata, "counts"=counts)
rm(counts, metadata)

Main

Before filtering

Summarize number of mapped reads and number of detected genes (counts > 0)

data.init$metadata <- 
  dplyr::inner_join(data.init$metadata, 
                    summarize_sequencing_results(data.init$counts), by="SampleName")

Plot - Detected genes x Mapped reads

plot_detected_genes_vs_mapped_reads(
  data.init$metadata, params$min.genes, params$min.reads, params$set.scale)

Initial gene filtering and normalization

data.init$counts.gf <-
  filter_genes_by_cpm_per_condition(data.init$counts, data.init$metadata, min.cpm=params$min.cpm)

# Normalize across all conditions
data.init$tmm.cpm.gf <- 
  transform_counts_to_tmm_cpm(data.init$counts.gf, data.init$metadata, log2=FALSE)

data.init$log2.tmm.cpm.gf <- 
  transform_counts_to_tmm_cpm(data.init$counts.gf, data.init$metadata, 
                              log2=TRUE, shift.min=TRUE)

Plot - PCA

Check sample clustering before removing low quality samples

data.init$pca <- run_pca(data.init$log2.tmm.cpm.gf)

By mapped reads

plot_pca_by_mapped_reads(data.init$pca, data.init$metadata, params$min.reads)
## Warning: Removed 10 rows containing missing values (geom_point).

plot_pca_by_mapped_reads_gradient(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).

By detected genes

plot_pca_by_detected_genes(data.init$pca, data.init$metadata, params$min.genes)
## Warning: Removed 10 rows containing missing values (geom_point).

plot_pca_by_detected_genes_gradient(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).

By batch

plot_pca_by_batch(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).

By condition

plot_pca_by_condition(data.init$pca, data.init$metadata)
## Warning: Removed 10 rows containing missing values (geom_point).

plot_pca_by_condition_pc34(data.init$pca, data.init$metadata)

Filter 1: Sequencing quality

Remove samples with less than minimum detected genes OR less than minimum mapped reads

data.qc1.seq <- 
  filter_samples_by_detected_genes_mapped_reads(
    data.init$counts, data.init$metadata, params$min.genes, params$min.reads)

# Remove tissue-sex conditions with few replicates remaining
data.qc1.seq <-
  filter_conditions_by_min_replicates(
    data.qc1.seq$counts, data.qc1.seq$metadata, params$min.replicates)
## [1] "Removing conditions:"
## [1] "F[0-9][0-9]_heart"        "M[0-9][0-9]_heart"        "F[0-9][0-9]_muscle"       "M[0-9][0-9]_muscle"      
## [5] "F[0-9][0-9]_swim_bladder" "M[0-9][0-9]_swim_bladder"
data.qc1.seq$counts.gf <-
  filter_genes_by_cpm_per_condition(data.qc1.seq$counts, data.qc1.seq$metadata, min.cpm=params$min.cpm)

# Normalize across all conditions
data.qc1.seq$tmm.cpm.gf <- 
  transform_counts_to_tmm_cpm(data.qc1.seq$counts.gf, data.qc1.seq$metadata, log2=FALSE)

data.qc1.seq$log2.tmm.cpm.gf <- 
  transform_counts_to_tmm_cpm(data.qc1.seq$counts.gf, data.qc1.seq$metadata,
                              log2=TRUE, shift.min=TRUE)

plot_detected_genes_vs_mapped_reads(data.qc1.seq$metadata, params$min.genes, params$min.reads)

Plot - PCA

data.qc1.seq$pca <- run_pca(data.qc1.seq$log2.tmm.cpm.gf)

By mapped reads

plot_pca_by_mapped_reads_gradient(data.qc1.seq$pca, data.qc1.seq$metadata)

plot_pca_by_detected_genes_gradient(data.qc1.seq$pca, data.qc1.seq$metadata)

By detected genes

plot_pca_by_detected_genes_gradient(data.qc1.seq$pca, data.qc1.seq$metadata)

By batch

plot_pca_by_batch(data.qc1.seq$pca, data.qc1.seq$metadata)

By condition

plot_pca_by_condition(data.qc1.seq$pca, data.qc1.seq$metadata)

plot_pca_by_condition_pc34(data.qc1.seq$pca, data.qc1.seq$metadata)

Plot - Correlation heatmap

data.qc1.seq$cor.matrix <- cor(data.qc1.seq$log2.tmm.cpm.gf, method="pearson")

plot_correlation_heatmap(data.qc1.seq$cor.matrix, data.qc1.seq$metadata, clust=TRUE)

Filter 2: Within-tissue correlation

Iterative removal of low correlation samples

data.qc2.cor <- 
  iterate_filter_samples_by_correlation(
    data.qc1.seq$counts, data.qc1.seq$metadata, min.cpm=params$min.cpm, round=2)
## [1] "Filtering and normalizing counts..."
## [1] "Removing samples:"
## [1] "F04_pectoral_fin"
## [1] "Filtering and normalizing counts..."
## [1] "No samples to remove"
# Remove tissue-sex conditions with few replicates remaining
data.qc2.cor <- 
  filter_conditions_by_min_replicates(
    data.qc2.cor$counts, data.qc2.cor$metadata, params$min.replicates)
## [1] "No conditions to remove"
data.qc2.cor$counts.gf <-
  filter_genes_by_cpm_per_condition(
    data.qc2.cor$counts, data.qc2.cor$metadata, min.cpm=params$min.cpm)

# Normalize across all conditions
data.qc2.cor$tmm.cpm.gf <- 
  transform_counts_to_tmm_cpm(data.qc2.cor$counts.gf, data.qc2.cor$metadata, log2=FALSE)

data.qc2.cor$log2.tmm.cpm.gf <- 
  transform_counts_to_tmm_cpm(data.qc2.cor$counts.gf, data.qc2.cor$metadata, 
                              log2=TRUE, shift.min=TRUE)

Plot - PCA

data.qc2.cor$pca <- run_pca(data.qc2.cor$log2.tmm.cpm.gf)

By batch

plot_pca_by_batch(data.qc2.cor$pca, data.qc2.cor$metadata)

By condition

plot_pca_by_condition(data.qc2.cor$pca, data.qc2.cor$metadata)

plot_pca_by_condition_pc34(data.qc2.cor$pca, data.qc2.cor$metadata)

Plot - Correlation heatmap

data.qc2.cor$cor.matrix <- cor(data.qc2.cor$log2.tmm.cpm.gf, method="pearson")

plot_correlation_heatmap(data.qc2.cor$cor.matrix, data.qc2.cor$metadata, clust=TRUE)

Boxplot - Detected genes per organ

boxplot_detected_genes(data.qc2.cor$metadata)

Gene biotype

Split genes by biotype

data.qc2.cor$coding$log2.tmm.cpm.gf <-
  data.qc2.cor$log2.tmm.cpm.gf[
    rownames(data.qc2.cor$log2.tmm.cpm.gf) 
        %in% biotype$Gene.stable.ID[biotype$Gene.type=="protein_coding"],]

data.qc2.cor$lncrna$log2.tmm.cpm.gf <-
  data.qc2.cor$log2.tmm.cpm.gf[
    rownames(data.qc2.cor$log2.tmm.cpm.gf) 
        %in% biotype$Gene.stable.ID[biotype$Gene.type=="lncRNA" | 
                                      biotype$Gene.type=="lincRNA"],]

# Protein-coding
print(nrow(data.qc2.cor$coding$log2.tmm.cpm.gf))
## [1] 19287
# lncRNA
print(nrow(data.qc2.cor$lncrna$log2.tmm.cpm.gf))
## [1] 479
# Neither
nrow(data.qc2.cor$log2.tmm.cpm.gf) - 
  (nrow(data.qc2.cor$coding$log2.tmm.cpm.gf) + nrow(data.qc2.cor$lncrna$log2.tmm.cpm.gf))
## [1] 590

Protein-coding

Plot - PCA

data.qc2.cor$coding$pca <- run_pca(data.qc2.cor$coding$log2.tmm.cpm.gf)
By batch
plot_pca_by_batch(data.qc2.cor$coding$pca, data.qc2.cor$metadata)

By condition
plot_pca_by_condition(data.qc2.cor$coding$pca, data.qc2.cor$metadata)

plot_pca_by_condition_pc34(data.qc2.cor$coding$pca, data.qc2.cor$metadata)

Plot - Correlation heatmap

data.qc2.cor$coding$cor.matrix <- 
  cor(data.qc2.cor$coding$log2.tmm.cpm.gf, method="pearson")

plot_correlation_heatmap(data.qc2.cor$coding$cor.matrix, 
                         data.qc2.cor$metadata, clust=TRUE)

lncRNA

Plot - PCA

data.qc2.cor$lncrna$pca <- run_pca(data.qc2.cor$lncrna$log2.tmm.cpm.gf)
By batch
plot_pca_by_batch(data.qc2.cor$lncrna$pca, data.qc2.cor$metadata)

By condition
plot_pca_by_condition(data.qc2.cor$lncrna$pca, data.qc2.cor$metadata)

plot_pca_by_condition_pc34(data.qc2.cor$lncrna$pca, data.qc2.cor$metadata)

Plot - Correlation heatmap

data.qc2.cor$lncrna$cor.matrix <- 
  cor(data.qc2.cor$lncrna$log2.tmm.cpm.gf, method="pearson")

plot_correlation_heatmap(data.qc2.cor$lncrna$cor.matrix, 
                         data.qc2.cor$metadata, clust=TRUE)

Reduced (all nonzero) expression matrix

data.qc2.cor.nonzero <- data.qc2.cor[c("conditions","metadata","counts")]

# Only keep genes with no zero counts across all samples
# to set the same number of 'expressed genes' per condition
data.qc2.cor.nonzero$counts.gf <-
  keep_full_nonzero_expression_matrix(data.qc2.cor.nonzero$counts)

# Normalize across all conditions
data.qc2.cor.nonzero$tmm.cpm.gf <-
  transform_counts_to_tmm_cpm(data.qc2.cor.nonzero$counts.gf, data.qc2.cor.nonzero$metadata, log2=FALSE)

data.qc2.cor.nonzero$log2.tmm.cpm.gf <- 
  transform_counts_to_tmm_cpm(data.qc2.cor.nonzero$counts.gf, data.qc2.cor.nonzero$metadata, 
                              log2=TRUE, shift.min=FALSE)

Plot - PCA

data.qc2.cor.nonzero$pca <- run_pca(data.qc2.cor.nonzero$log2.tmm.cpm.gf)

By batch

plot_pca_by_batch(data.qc2.cor.nonzero$pca, data.qc2.cor.nonzero$metadata)

By condition

plot_pca_by_condition(data.qc2.cor.nonzero$pca, data.qc2.cor.nonzero$metadata)

plot_pca_by_condition_pc34(data.qc2.cor.nonzero$pca, data.qc2.cor.nonzero$metadata)

Plot - Correlation heatmap

data.qc2.cor.nonzero$cor.matrix <- cor(data.qc2.cor.nonzero$log2.tmm.cpm.gf, method="pearson")

plot_correlation_heatmap(data.qc2.cor.nonzero$cor.matrix, data.qc2.cor.nonzero$metadata, clust=TRUE)

Normalization per condition

data.conditions <- vector(mode="list", length=6)
names(data.conditions) <- 
  c("conditions","metadata","counts","counts.gf","tmm.cpm.gf","log2.tmm.cpm.gf")

data.conditions$conditions <- data.qc2.cor$conditions
data.conditions$metadata <- data.qc2.cor$metadata

Tissue <- as.factor(data.qc2.cor$metadata$Tissue)

if ("Sex" %in% colnames(data.qc2.cor$metadata)) {
  Sex <- as.factor(data.qc2.cor$metadata$Sex)
  
  data.conditions$counts <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        subset_expr_matrix(data.qc2.cor$counts, tissue=t, sex=s)
      })
    })
  
  data.conditions$counts.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        filter_genes_by_cpm(data.conditions$counts[[t]][[s]], 
                            min.cpm=params$min.cpm, tmm=FALSE)
      })
    })

  data.conditions$tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        transform_counts_to_tmm_cpm(
          data.conditions$counts.gf[[t]][[s]],
          data.conditions$metadata[data.conditions$metadata$Tissue==t &
                                  data.conditions$metadata$Sex==s,],
          log2=FALSE)
      })
    })
    
  data.conditions$log2.tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        transform_counts_to_tmm_cpm(
          data.conditions$counts.gf[[t]][[s]],
          data.conditions$metadata[data.conditions$metadata$Tissue==t &
                                  data.conditions$metadata$Sex==s,],
          log2=TRUE, shift.min=TRUE)
      })
    })
  
} else {
  data.conditions$counts <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      subset_expr_matrix(data.qc2.cor$counts, tissue=t)
    })
  
  data.conditions$counts.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      filter_genes_by_cpm(data.conditions$counts[[t]], 
                          min.cpm=params$min.cpm, tmm=FALSE)
    })
  
  data.conditions$tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      transform_counts_to_tmm_cpm(
        data.conditions$counts.gf[[t]],
          data.conditions$metadata[data.conditions$metadata$Tissue==t,],
          log2=FALSE)
    })
  
  data.conditions$log2.tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      transform_counts_to_tmm_cpm(
        data.conditions$counts.gf[[t]],
          data.conditions$metadata[data.conditions$metadata$Tissue==t,],
          log2=TRUE, shift.min=TRUE)
    })
}

Reduced (all nonzero) expression matrix

data.conditions.nonzero <- vector(mode="list", length=6)
names(data.conditions.nonzero) <- 
  c("conditions","metadata","counts","counts.gf","tmm.cpm.gf","log2.tmm.cpm.gf")

data.conditions.nonzero$conditions <- data.qc2.cor$conditions
data.conditions.nonzero$metadata <- data.qc2.cor$metadata

Tissue <- as.factor(data.qc2.cor$metadata$Tissue)

if ("Sex" %in% colnames(data.qc2.cor$metadata)) {
  Sex <- as.factor(data.qc2.cor$metadata$Sex)
  
  data.conditions.nonzero$counts <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        subset_expr_matrix(data.qc2.cor$counts, tissue=t, sex=s)
      })
    })
  
  data.conditions.nonzero$counts.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        data.conditions.nonzero$counts[[t]][[s]][rownames(data.qc2.cor.nonzero$counts.gf),]
      })
    })

  data.conditions.nonzero$tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        transform_counts_to_tmm_cpm(
          data.conditions.nonzero$counts.gf[[t]][[s]],
          data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t &
                                  data.conditions.nonzero$metadata$Sex==s,],
          log2=FALSE)
      })
    })
    
  data.conditions.nonzero$log2.tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      lapply(setNames(levels(Sex), levels(Sex)), function(s) {
        transform_counts_to_tmm_cpm(
          data.conditions.nonzero$counts.gf[[t]][[s]],
          data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t &
                                  data.conditions.nonzero$metadata$Sex==s,],
          log2=TRUE, shift.min=TRUE)
      })
    })
  
} else {
  data.conditions.nonzero$counts <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      subset_expr_matrix(data.qc2.cor$counts, tissue=t)
    })
  
  data.conditions.nonzero$counts.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      data.conditions.nonzero$counts[[t]][rownames(data.qc2.cor.nonzero$counts.gf),]
    })
  
  data.conditions.nonzero$tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      transform_counts_to_tmm_cpm(
        data.conditions.nonzero$counts.gf[[t]],
          data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t,],
          log2=FALSE)
    })
  
  data.conditions.nonzero$log2.tmm.cpm.gf <-
    lapply(setNames(levels(Tissue), levels(Tissue)), function(t) {
      transform_counts_to_tmm_cpm(
        data.conditions.nonzero$counts.gf[[t]],
          data.conditions.nonzero$metadata[data.conditions.nonzero$metadata$Tissue==t,],
          log2=TRUE, shift.min=TRUE)
    })
}

Save

if (!dir.exists(params$output.rds.path)) { dir.create(params$output.rds.path, showWarnings=FALSE) }

# Normalized across all conditions
saveRDS(data.qc2.cor, 
        file.path(params$output.rds.path, 
                  paste(params$species, 
                        params$min.replicates, "reps",
                        "data.qc2.cor.rds", sep="_")))

saveRDS(data.qc2.cor.nonzero, 
        file.path(params$output.rds.path, 
                  paste(params$species, 
                        params$min.replicates, "reps",
                        "data.qc2.cor.nonzero.rds", sep="_")))
# Normalized per condition
saveRDS(data.conditions, 
        file.path(params$output.rds.path, 
                  paste(params$species,
                        params$min.replicates, "reps",
                        "data.conditions.rds", sep="_")))

saveRDS(data.conditions.nonzero, 
        file.path(params$output.rds.path, 
                  paste(params$species,
                        params$min.replicates, "reps",
                        "data.conditions.nonzero.rds", sep="_")))